/**********************************************************************
gc_5_create_estimation_sample.do

**********************************************************************/
**********
* SET UP *
**********
* reset the session: clear all drops data, programs, matrices, etc. *
clear all
set matsize 2000
* legacy numeric syntax; presumably equivalent to "set more off" - TODO confirm *
set more 1

* location for dofiles *
cd "T:\_Projet_4915\dofiles"
* remember the do-file directory in a global so it can be reused later *
global dir "`c(pwd)'"
* quote the path so that cd still works if the directory name contains spaces *
cd "$dir"

*********
* GATES *
*********
* Specify which data you want to work with (synthetic = syn, real = rl) *
local ext = "rl"

* Specify which gender to process (men = men, women = women) *
local gender = "men"

* Specify first observed age *
local start_age = 25

* Specify occupational definition to use (occ_version = 1 or 2)
local occ_version = 2

* STEP 1: prepare dataset for analysis *
local gate1 = 1

* start log file (close any log left open by a previous run first) *
* the log name encodes gender, start age, occupation version and data type *
quietly capture log close
quietly log using gc_5_create_estimation_sample_`gender'_start`start_age'_occ_v`occ_version'_`ext', text replace

* specify file locations (paths are relative to the root of drive T:) *
global project_folder "\_Projet_4915"
global data_folder "\_Projet_4915\DATA"
global output_folder "\_Projet_4915\ResultsFolder"
global temp "temp"

* full path to the data folder, with a trailing separator *
* NOTE: data_folder already starts with a backslash, so none is written after T: *
* A backslash placed directly before a dollar sign tells Stata not to expand the *
* macro at that point (see [U] 18.3.11), which made the path depend on delayed  *
* expansion at every use site. Expanding it here gives the same final path.     *
local datadir "T:${data_folder}\"

*******************************************
* MAKE A GLOBAL VARIABLE FOR TODAY'S DATE *
*******************************************
* split the system date string into year (chars 8-11), month name (chars 4-6) *
* and day of month (chars 1-2, trimmed) *
local tyr = substr("$S_DATE",8,4)
local tmo = substr("$S_DATE",4,3)
local tmd = trim(substr("$S_DATE",1,2))

* make day of month two digits: prefix a zero when it has a single character *
local wl = length("`tmd'")
local tmd2 = cond(`wl'==1, "0"+"`tmd'", "`tmd'")

* get numeric month by parsing the reassembled date, then make it two digits *
local di "`tmd2'`tmo'`tyr'"
local edate = date("`di'", "DMY")
local mono = month(`edate')
local ml = length("`mono'")
local mono2 = cond(`ml'==1, "0"+"`mono'", "`mono'")

* put final date together as year, month, day with no separators *
global date "`tyr'`mono2'`tmd2'"

****************************************
* DEFINE LITTLE PROGRAMS TO PRINT TIME *
****************************************
program define starttime
	* print the current clock time and date when a processing step begins *
	local now "`c(current_time)'"
	local today "`c(current_date)'"
	display "Started processing at `now' on `today'"
end

program define endtime
	* print the current clock time and date when a processing step ends *
	local now "`c(current_time)'"
	local today "`c(current_date)'"
	display "Finished processing at `now' on `today'"
end

************************
* START OF THE PROGRAM *
************************

****************************************
* STEP 1: prepare dataset for analysis *
****************************************
* helper: count and print the number of distinct individuals (pid) in memory *
* usage: report_n_individuals <sample label> [now]                          *
*   <sample label> is printed in the message (the gender being processed)   *
*   passing "now" as second argument prints "There are now ..."             *
capture program drop report_n_individuals
program define report_n_individuals
	args sample_label now
	tempvar first_row
	* bysort is used so that, as before, the data end up sorted by pid *
	bysort pid: gen byte `first_row' = (_n == 1)
	count if `first_row' == 1
	local n_ind = r(N)
	if "`now'" != "" {
		local now "`now' "
	}
	display "...There are `now'`n_ind' individuals in the `sample_label' dataset..."
end

disp "***** Started processing STEP 1 *****"
starttime
if 1 == `gate1' {
	disp "***** STEP 1: prepare dataset for analysis *****"
	* full path to the data folder, with a trailing separator. data_folder already *
	* starts with a backslash; writing another one directly before the dollar sign *
	* would suppress macro expansion at this point (see [U] 18.3.11).              *
	local datadir "T:${data_folder}\"

	* Save dataset of postal code areas to be used later for initial conditions *
	* (one row per postal code with its area; "NtFound" codes are discarded)    *
	use "`datadir'PC_masterfile.dta", clear
	drop if d_t1_pc == "NtFound"
	keep d_t1_pc area_m
	rename d_t1_pc res_pc
	rename area_m res_area_m
	duplicates drop
	compress
	save "`datadir'gc_tmp_res_pc_area.dta", replace
	clear
	
	* load the individual panel (one row per pid and tax_yr) *
	use "`datadir'gc_individual_panel_`gender'_`ext'.dta", clear
	sort pid tax_yr
	
	* keep only the requested gender (sex: 1 = men, 2 = women) *
	if "`gender'" == "men" {
		keep if sex == 1
	}
	if "`gender'" == "women" {
		keep if sex == 2
	}
	
	* display the number of individuals *
	report_n_individuals `gender'

	*******************************************************************************
	* keep only selected cohorts: born 2001 - start_age through 2010 - start_age *
	* (1976-1985 when start_age = 25); missing dob_yr is dropped by the 2nd line *
	*******************************************************************************
	display "...Keep only selected cohorts..."
	drop if dob_yr < 2001 - `start_age'
	drop if dob_yr > 2010 - `start_age'
	
	**************************************************************
	* keep only observations at or above the first observed age *
	**************************************************************
	drop if age < `start_age'
	
	* display the number of individuals *
	report_n_individuals `gender' now
	
	*******************
	* drop immigrants *
	*******************
	display "...Drop first-generation immigrants..."
	drop if inimdb == 1
	
	* display the number of individuals *
	report_n_individuals `gender' now
	
	************************************************************************
	* Construct various occupational categories based on sources of income *
	************************************************************************
	* NOTE(review): Stata treats a missing value as larger than any number, so a *
	* missing income on the left-hand side satisfies ">= y_0" - confirm that the *
	* income variables are never missing. y_0 is presumably the income threshold *
	* for nonemployment - TODO confirm. *
	* v1 uses income from all incorporated businesses (y_3_all) *
	* nonemployed *
	gen occ_v1 = 0
	* incorporated (takes priority over the other categories) *
	replace occ_v1 = 3 if y_3_all >= y_0
	* unincorporated *
	replace occ_v1 = 2 if occ_v1 == 0 & y_2 >= y_0
	* paid employee *
	replace occ_v1 = 1 if occ_v1 == 0 & y_1 >= y_0
	label var occ_v1 "Occupation, v1"
	
	* construct relevant measure of income (stays missing for the nonemployed) *
	gen y_v1 = .
	replace y_v1 = y_1 if occ_v1 == 1
	replace y_v1 = y_2 if occ_v1 == 2
	replace y_v1 = y_3_all if occ_v1 == 3
	label var y_v1 "Annual income (main occupation), v1"

	* v2 uses income from startups only (y_3_startups) *
	* nonemployed *
	gen occ_v2 = 0
	* incorporated (takes priority over the other categories) *
	replace occ_v2 = 3 if y_3_startups >= y_0
	* unincorporated *
	replace occ_v2 = 2 if occ_v2 == 0 & y_2 >= y_0
	* paid employee *
	replace occ_v2 = 1 if occ_v2 == 0 & y_1 >= y_0
	label var occ_v2 "Occupation, v2"
	
	* construct relevant measure of income (stays missing for the nonemployed) *
	gen y_v2 = .
	replace y_v2 = y_1 if occ_v2 == 1
	replace y_v2 = y_2 if occ_v2 == 2
	replace y_v2 = y_3_startups if occ_v2 == 3
	label var y_v2 "Annual income (main occupation), v2"

	**************************
	* merge firm information *
	**************************
	* (a) employer side: blank the firm id for everyone who is not a paid employee *
	* so that employer characteristics are attached to paid employees only         *
	replace eid_long = "" if occ_v`occ_version' != 1
	rename eid eid_Ind
	merge m:1 eid_long using "`datadir'gc_firm_info_`ext'.dta", keepusing(naics4_int naics3_int naics2_int f*)
	drop if _merge == 2
	rename eid_long eid_Ind_long
	drop _merge
	ds
	* (b) business side: temporarily name the relevant business id "eid" to match *
	* the key of the longitudinal id file; that file presumably supplies a new    *
	* eid_long, since the code below refers to it again - TODO confirm            *
	if `occ_version' == 1 {
		rename eid_all eid
	}
	if `occ_version' == 2 {
		rename eid_startup eid
	}
	merge m:1 eid tax_yr using "`datadir'gc_longitudinal_eid_`ext'.dta"
	drop if _merge == 2
	drop _merge
	* keep the business firm id for incorporated entrepreneurs only *
	replace eid_long = "" if occ_v`occ_version' != 3
	* restore the original name of the business id *
	if `occ_version' == 1 {
		rename eid eid_all
	}
	if `occ_version' == 2 {
		rename eid eid_startup
	}
	* update replace: for matched rows the industry codes of the business *
	* overwrite whatever was merged in step (a)                          *
	merge m:1 eid_long using "`datadir'gc_firm_info_`ext'.dta", keepusing(naics4_int naics3_int naics2_int) update replace
	drop if _merge == 2
	rename eid_long eid_Bus_long
	drop _merge
	ds
	
	***************************************************
	* drop worker with missing firm class information *
	***************************************************
	display "...Drop worker with missing firm class information..."
	drop if occ_v`occ_version' == 1 & f2_rev_perL_4c == .
	
	* display the number of individuals *
	report_n_individuals `gender' now
	
	********************************************************************
	* if focusing on startups, drop entrepreneurs at non-startup firms *
	********************************************************************
	* with occ_version = 1 this drops nothing, because occ_v1 is 3 whenever *
	* y_3_all >= y_0                                                        *
	display "...if focusing on startups, drop entrepreneurs at non-startup firms..."
	drop if y_3_all >= y_0 & occ_v`occ_version' != 3
	
	* display the number of individuals *
	report_n_individuals `gender' now
	
	******************************
	* sample selection continued *
	******************************
	* drop if first year in the dataset is after 2001 and age in that year is greater than the start age *
	display "...Drop if first year in the dataset is after 2001 and age in that year is greater than `start_age'..."
	sort pid tax_yr
	by pid: egen entry_yr = min(tax_yr)
	gen flag = 0
	replace flag = 1 if entry_yr > 2001 & age > `start_age' & tax_yr == entry_yr
	* spread the flag to all rows of the person, then drop the whole person *
	by pid: egen flag_person = max(flag)
	drop if flag_person == 1
	drop flag*

	* display the number of individuals *
	report_n_individuals `gender' now

	* drop if missing at any point between entry_yr and exit_yr
	* (fewer rows than calendar years spanned means there is a gap) *
	display "...Drop if missing at any point between entry_yr and exit_yr..."
	sort pid tax_yr
	by pid: egen exit_yr = max(tax_yr)
	bys pid: gen counter = _N
	gen potential_yrs = exit_yr - entry_yr + 1
	gen flag = 0
	replace flag = 1 if counter < potential_yrs
	by pid: egen flag_person = max(flag)
	drop if flag_person == 1
	drop flag* counter potential_yrs

	* display the number of individuals *
	report_n_individuals `gender' now
	
	* drop if observed for less than 3 years *
	display "...Drop if observed for less than 3 years..."
	sort pid tax_yr
	bys pid: gen counter = _N
	drop if counter < 3
	drop counter

	* display the number of individuals *
	report_n_individuals `gender' now
	
	* keep only individuals who are employed at the start age: paid-employment *
	* income reaches y_0 while both business incomes are below it             *
	display "...Keep only individuals who are employed at `start_age'"
	sort pid tax_yr
	gen flag = 0
	replace flag = 1 if age == `start_age' & y_3_all < y_0 & y_2 < y_0 & y_1 >= y_0
	by pid: egen flag_person = max(flag)
	keep if flag_person == 1
	drop flag*
	
	* display the number of individuals *
	report_n_individuals `gender' now
	
	* drop individuals with missing postal code info at the start age. *
	* (the message previously used the misspelled macro name start-age, so *
	* the age was not printed)                                            *
	display "...Drop individuals with missing postal code info at `start_age'..."
	merge m:1 res_pc using "`datadir'gc_tmp_res_pc_area.dta"
	drop if _merge == 2
	drop _merge
	sort pid tax_yr
	gen flag = 0
	* blank postal code, zero postal code income, or no area for the postal code *
	replace flag = 1 if age == `start_age' & res_pc == ""
	replace flag = 1 if age == `start_age' & res_pc_inc == 0
	replace flag = 1 if age == `start_age' & res_area_m == .
	by pid: egen flag_person = max(flag)
	drop if flag_person == 1
	drop flag*
	
	* display the number of individuals *
	report_n_individuals `gender' now
	
	* drop individual if entrepreneur at some point but used to work in the company before *
	* also drop individual if worker at some point but used to be entrepreneur in same company before *
	sort pid tax_yr
	by pid: gen period = _n
	gen flag = 0
	* look back 1 to 11 periods; lags are built, compared and dropped one at a time *
	* so that at most three extra variables are held in memory                      *
	forvalues i = 1/11 {
		* occupation i periods ago (missing when fewer than i earlier rows exist) *
		gen L`i'occ_v`occ_version' = .
		by pid: replace L`i'occ_v`occ_version' = occ_v`occ_version'[_n - `i'] if period > `i'
		* employer id i periods ago, only if the person was a paid employee then *
		gen L`i'eid_Ind_long = ""
		by pid: replace L`i'eid_Ind_long = eid_Ind_long[_n - `i'] if period > `i' & L`i'occ_v`occ_version' == 1
		* business id i periods ago, only if the person was an entrepreneur then *
		gen L`i'eid_Bus_long = ""
		by pid: replace L`i'eid_Bus_long = eid_Bus_long[_n - `i'] if period > `i' & L`i'occ_v`occ_version' == 3

		* a blank id is not a firm: without the nonblank conditions a blank current *
		* id would equal every blank lag and the person would be dropped by mistake *
		replace flag = 1 if occ_v`occ_version' == 3 & eid_Bus_long != "" & eid_Bus_long == L`i'eid_Ind_long
		replace flag = 1 if occ_v`occ_version' == 1 & eid_Ind_long != "" & eid_Ind_long == L`i'eid_Bus_long

		* drop exactly the lags created above (a wildcard on the letter L would *
		* also remove any other variable whose name starts with L)              *
		drop L`i'occ_v`occ_version' L`i'eid_Ind_long L`i'eid_Bus_long
	}
	by pid: egen flag_person = max(flag)
	drop if flag_person == 1
	drop flag*
	
	* display the number of individuals *
	report_n_individuals `gender' now
	
	*****************************************
	* Construct initial condition variables *
	*****************************************
	* each variable takes the value observed at the start age and copies it to *
	* all rows of the person; the count reports rows where it is missing       *
	* Average employment income in postal code at the start age *
	gen aux = res_pc_inc if age == `start_age'
	bys pid: egen res_pc_inc_entry = max(aux)
	drop aux
	count if res_pc_inc_entry == .
	
	* Share of business owners in postal code at the start age *
	gen aux = res_pc_shr_bus if age == `start_age'
	bys pid: egen res_pc_shr_bus_entry = max(aux)
	drop aux
	count if res_pc_shr_bus_entry == .
	
	* Population density in postal code at the start age *
	gen aux = res_pc_pop/res_area_m if age == `start_age'
	bys pid: egen res_pc_popden_entry = max(aux)
	drop aux
	count if res_pc_popden_entry == .
	
	* Density of entrepreneurs in postal code at the start age *
	gen aux = res_pc_busipop/res_area_m if age == `start_age'
	bys pid: egen res_pc_busipopden_entry = max(aux)
	drop aux
	count if res_pc_busipopden_entry == .
	
	* Initial city size (missing values are assigned to an extra category 4) *
	gen aux = res_city_size_3c if age == `start_age'
	bys pid: egen res_city_size_entry = max(aux)
	drop aux
	replace res_city_size_entry = 4 if res_city_size_entry == .
	
	* display the number of individuals *
	report_n_individuals `gender' now
	
	************
	* CLEAN UP *
	************
	* save intermediate dataset to the data_folder *
	sort pid tax_yr
	ds
	compress
	save "`datadir'gc_analyze_me_`gender'_start`start_age'_occ_v`occ_version'_`ext'.dta", replace
	clear
	
	* remove the temporary postal code file (ignore the error if it is absent) *
	capture erase "`datadir'gc_tmp_res_pc_area.dta"
	
	disp "***** STEP 1: make sample restrictions (COMPLETED)*****"
}
disp "***** Finished processing STEP 1 *****"
endtime

********
* EXIT *
********
* drop everything left in memory, then close the log opened at the top of this do-file *
clear all
log close
